I’ll try to find a decent hyperparameter set for the convnet, and I’ll also try a separable_2d_conv layer.
To begin, I’ll simply swap the usual 2d_conv layer for a separable_2d_conv.
# Load the pre-scaled, single-channel TF data and shuffle the rows so the
# train/validation/test split is not confounded by the original ordering.
arab_data <- readRDS("../data/scaled_arab_TF_one_channel.RDS")

n_samples <- dim(arab_data$x)[1]  # 8704 rows in this dataset
new_order <- sample(seq_len(n_samples))
shuffled_data <- list(
  x = arab_data$x[new_order, , , , drop = FALSE],
  y = as.numeric(arab_data$y[new_order])
)

# Fixed split: ~79% train, ~10% validation, ~10% test.
train_i <- 1:6900
val_i <- 6901:7802
test_i <- 7803:8704

x_train <- shuffled_data$x[train_i, , , , drop = FALSE]
x_val <- shuffled_data$x[val_i, , , , drop = FALSE]
x_test <- shuffled_data$x[test_i, , , , drop = FALSE]
y_train <- shuffled_data$y[train_i]
y_val <- shuffled_data$y[val_i]
y_test <- shuffled_data$y[test_i]
# Build the separable-conv model. On failure `model` is NULL; previously the
# handlers returned print()'s value, so a warning/error silently left `model`
# holding a character string rather than a keras model.
model <- tryCatch({
  keras_model_sequential() %>%
    layer_separable_conv_2d(filters = 32, kernel_size = c(2, 5),
                            activation = "relu",
                            input_shape = c(2, 232, 1), padding = "same") %>%
    layer_max_pooling_2d(pool_size = c(2, 2)) %>%
    layer_separable_conv_2d(filters = 64, kernel_size = c(2, 5),
                            activation = "relu", padding = "same") %>%
    layer_flatten() %>%
    layer_dense(units = 64, activation = "relu") %>%
    layer_dense(units = 1, activation = "sigmoid")
},
warning = function(w) {
  message("Warning: ", conditionMessage(w))
  NULL
},
error = function(e) {
  message("Error: ", conditionMessage(e))
  NULL
})
summary(model)
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## separable_conv2d_1 (SeparableCon (None, 2, 232, 32) 74
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D) (None, 1, 116, 32) 0
## ___________________________________________________________________________
## separable_conv2d_2 (SeparableCon (None, 1, 116, 64) 2432
## ___________________________________________________________________________
## flatten_1 (Flatten) (None, 7424) 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 64) 475200
## ___________________________________________________________________________
## dense_2 (Dense) (None, 1) 65
## ===========================================================================
## Total params: 477,771
## Trainable params: 477,771
## Non-trainable params: 0
## ___________________________________________________________________________
# Compile and train the model; a training failure is reported but not fatal.
tryCatch({
  model %>% compile(
    optimizer = "rmsprop",
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )
  history <- model %>% fit(
    x_train,
    y_train,
    epochs = 20,
    batch_size = 512,
    validation_data = list(x_val, y_val)
  )
  plot(history)
},
error = function(e) {
  # conditionMessage() gives the readable message; paste()-ing the condition
  # object itself produced noisy output.
  message("error: ", conditionMessage(e))
})
A similar result to the conv_2d layer, but perhaps a bit smoother. The validation curve is still tracking the training curve nicely. The accuracy is still rising and the loss is still dropping, so the number of epochs might need to go up.
I shall try building random structures, as I did for the dense network, and see which works best. Not all model structures will compile, so let’s quickly build them and see which ones do.
# Draw a random convnet structure.
#
# Returns a list with:
#   layers          - number of separable conv layers, drawn from 2..max_layers
#   nodes_per_layer - filter count per conv layer (powers of 2: 8..max_nodes)
#   dense_layer     - unit count for the penultimate dense layer
random_structure <- function(max_layers = 4, max_nodes = 64) {
  layer_sizes <- 2^(3:6)  # candidate sizes: 8, 16, 32, 64
  layer_sizes <- layer_sizes[layer_sizes <= max_nodes]
  # Guard the degenerate case: sample(2:2, 1) draws from 1:2 (the classic
  # sample() scalar footgun), which could yield an invalid 1-layer structure.
  layers <- if (max_layers == 2) 2L else sample(2:max_layers, 1)
  list(
    layers = layers,
    nodes_per_layer = sample(layer_sizes, layers, replace = TRUE),
    dense_layer = sample(layer_sizes, 1)
  )
}
# Build and compile a separable convnet from a structure list (as produced by
# random_structure()). The input shape is fixed to the 2 x 232 x 1 TF data
# used throughout this analysis.
#
# The previous default structure had three node counts for two layers and no
# dense_layer element, so make_model() with defaults failed at layer_dense().
make_model <- function(structure = list(layers = 2,
                                        nodes_per_layer = c(32, 64),
                                        dense_layer = 64)) {
  m <- keras_model_sequential() %>%
    layer_separable_conv_2d(filters = structure$nodes_per_layer[1],
                            kernel_size = c(2, 5), activation = "relu",
                            input_shape = c(2, 232, 1), padding = "same") %>%
    layer_max_pooling_2d(pool_size = c(2, 2))
  for (l in 2:structure$layers) {
    m <- m %>% layer_separable_conv_2d(filters = structure$nodes_per_layer[l],
                                       kernel_size = c(2, 5),
                                       activation = "relu", padding = "same")
    # No pooling after the final conv layer; it feeds straight into flatten.
    if (l != structure$layers) {
      m <- m %>% layer_max_pooling_2d(pool_size = c(2, 2))
    }
  }
  m <- m %>%
    layer_flatten() %>%
    layer_dense(units = structure$dense_layer, activation = "relu") %>%
    layer_dense(units = 1, activation = "sigmoid")
  m %>% compile(
    optimizer = "rmsprop",
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )
  m
}
# Sample 500 random structures and record which ones keras will even build.
# Results are cached in valid_convnets.RDS; one row per conv layer so layer
# sizes can be analysed individually later.
if (file.exists("valid_convnets.RDS")) {
  df <- readRDS("valid_convnets.RDS")
} else {
  rows <- vector("list", 500)  # preallocate rather than growing df in the loop
  for (l in seq_len(500)) {
    message("trying ", l)
    structure <- random_structure()
    r <- try(make_model(structure))
    # inherits() is the robust way to test for a try() failure.
    is_valid <- !inherits(r, "try-error")
    rows[[l]] <- data.frame(
      run_id = rep(l, structure$layers),
      layers = rep(structure$layers, structure$layers),
      layer_id = seq_len(structure$layers),
      nodes_per_layer = structure$nodes_per_layer,
      dense = rep(structure$dense_layer, structure$layers),
      is_valid = rep(is_valid, structure$layers)
    )
  }
  df <- dplyr::bind_rows(rows)
  saveRDS(df, file = "valid_convnets.RDS")
}
# How deep do the buildable structures get?
valid_structures <- dplyr::filter(df, is_valid == TRUE)
summarize(valid_structures, max(layers))
## max(layers)
## 1 2
OK, so of 500 random layer structures, only those with two convolutional layers would even compile. Fair enough. Let’s see how many different structures we get.
# Tag each valid two-layer run as "<first>_<second>_<dense>".
two_layer_runs <- df %>% dplyr::filter(is_valid == TRUE, layers == 2)
structure_info <- two_layer_runs %>%
  group_by(run_id) %>%
  mutate(tag = paste0(first(nodes_per_layer), "_",
                      last(nodes_per_layer), "_", dense)) %>%
  select(tag)
unique(structure_info$tag)
## [1] "32_32_8" "32_16_8" "32_32_32" "8_64_64" "64_32_16" "32_8_32"
## [7] "8_32_64" "32_64_64" "8_16_8" "64_16_16" "8_16_32" "64_8_32"
## [13] "16_16_8" "64_8_8" "16_32_8" "16_64_8" "64_64_32" "16_64_32"
## [19] "64_32_64" "8_64_32" "64_8_64" "32_16_16" "32_16_32" "8_8_8"
## [25] "64_64_64" "32_32_64" "16_8_8" "16_64_16" "8_32_8" "8_16_64"
## [31] "16_32_32" "32_8_64" "32_32_16" "64_32_32" "32_16_64" "64_16_32"
## [37] "64_16_64" "8_8_32" "8_64_16" "16_8_64" "64_64_8" "16_8_32"
## [43] "8_8_64" "16_16_64" "16_32_64" "8_16_16" "8_32_16" "8_32_32"
## [49] "32_8_16" "16_16_32" "64_16_8" "32_64_16" "16_64_64" "64_8_16"
## [55] "8_64_8" "32_64_32" "32_8_8" "32_64_8"
And there are 58 different structures. I’ll extract and try them all!
# Reduce to the distinct structures and split each tag back into its layer
# sizes. convert = TRUE makes the resulting columns integer rather than
# character, so downstream model building gets numeric filter counts.
structure_info <- structure_info %>%
  ungroup() %>%
  select(tag) %>%
  distinct(tag) %>%
  separate(tag, into = c("layer1", "layer2", "dense"), sep = "_",
           convert = TRUE)
structure_info
## # A tibble: 58 x 3
## layer1 layer2 dense
## <chr> <chr> <chr>
## 1 32 32 8
## 2 32 16 8
## 3 32 32 32
## 4 8 64 64
## 5 64 32 16
## 6 32 8 32
## 7 8 32 64
## 8 32 64 64
## 9 8 16 8
## 10 64 16 16
## # ... with 48 more rows
# Train `mod` on the training data, then score it on the validation data.
# Returns just the validation accuracy by default; with full_result = TRUE
# the whole evaluate() result (loss plus metrics) is returned instead.
evaluate_model <- function(mod, x_train = NULL, y_train = NULL,
                           x_val = NULL, y_val = NULL,
                           epochs = 20, batch_size = 512,
                           verbose = 0, full_result = FALSE) {
  fit(mod,
      x_train,
      y_train,
      epochs = epochs,
      batch_size = batch_size,
      verbose = verbose)
  result <- evaluate(mod,
                     x_val,
                     y_val,
                     verbose = verbose)
  if (full_result) result else result$acc
}
# Estimate a structure's accuracy by k-fold cross-validation (k = 4).
# Each fold builds a fresh model, trains it, and records the validation
# accuracy; folds that fail to build or train contribute NA. Returns the
# mean accuracy over the folds (NAs removed). The supplied data should not
# include the held-out test set, only the rows to be k-folded.
model_accuracy <- function(structure,
                           training_validation_x, training_validation_y,
                           epochs = 20, batch_size = 512,
                           verbose = 0) {
  k <- 4
  # Randomly partition the rows into k roughly equal folds.
  indices <- sample(seq_len(nrow(training_validation_x)))
  folds <- cut(indices, breaks = k, labels = FALSE)
  accuracy <- rep(NA_real_, k)
  for (i in seq_len(k)) {
    val_ind <- which(folds == i)
    x_val <- training_validation_x[val_ind, , , , drop = FALSE]
    y_val <- training_validation_y[val_ind]
    x_train <- training_validation_x[-val_ind, , , , drop = FALSE]
    y_train <- training_validation_y[-val_ind]
    # NOTE: the original appended to `accuracy` inside tryCatch *handlers*,
    # which only modified a handler-local copy and never reached this vector.
    # Returning the fold's value from tryCatch and assigning by index fixes
    # that scoping bug; a warning or error simply leaves the fold as NA.
    accuracy[i] <- tryCatch({
      m <- make_model(structure = structure)
      evaluate_model(m,
                     x_train = x_train, y_train = y_train,
                     x_val = x_val, y_val = y_val,
                     epochs = epochs, batch_size = batch_size,
                     verbose = verbose)
    },
    warning = function(w) NA_real_,
    error = function(e) {
      message("fold ", i, " failed: ", conditionMessage(e))
      NA_real_
    })
  }
  mean(accuracy, na.rm = TRUE)
}
# Cross-validate every candidate 2-layer structure and save a long-format
# results data frame (one row per conv layer per model) to `outfile`.
# structure_info must have one row per structure: first-layer filters,
# second-layer filters, dense units (columns 1:3).
run_models <- function(structure_info, outfile = "convnet_2_layer_models.RDS") {
  arab_data <- readRDS("../data/scaled_arab_TF_one_channel.RDS")
  n_samples <- dim(arab_data$x)[1]
  new_order <- sample(seq_len(n_samples))
  shuffled_data <- list(
    x = arab_data$x[new_order, , , , drop = FALSE],
    y = as.numeric(arab_data$y[new_order])
  )
  rows <- vector("list", nrow(structure_info))  # preallocate; no rbind-in-loop
  for (model_id in seq_len(nrow(structure_info))) {
    message(" running sample ", model_id)
    # unlist() before as.integer(): a 1-row data.frame/tibble slice is a list
    # and cannot be coerced directly; this also handles character columns
    # left over from separate().
    structure <- list(
      layers = 2L,
      nodes_per_layer = as.integer(unlist(structure_info[model_id, 1:2])),
      dense_layer = as.integer(unlist(structure_info[model_id, 3]))
    )
    accuracy <- model_accuracy(
      structure,
      training_validation_x = shuffled_data$x[1:7802, , , , drop = FALSE],
      training_validation_y = shuffled_data$y[1:7802]
    )
    rows[[model_id]] <- data.frame(
      model_id = rep(model_id, structure$layers),
      layer_id = seq_len(structure$layers),
      neurons_in_layer = structure$nodes_per_layer,
      layers = rep(structure$layers, structure$layers),
      dense_layer = rep(structure$dense_layer, structure$layers),
      accuracy = rep(accuracy, structure$layers)
    )
  }
  saveRDS(dplyr::bind_rows(rows), outfile)
}
# Run the sweep only once; results are cached on disk.
if (!file.exists("convnet_2_layer_models.RDS")) {
  run_models(structure_info, outfile = "convnet_2_layer_models.RDS")
}

# 3-D scatter of the structure space, coloured by cross-validated accuracy.
sweep_results <- readRDS("convnet_2_layer_models.RDS") %>%
  group_by(model_id) %>%
  transmute(nodes_in_first_layer = first(neurons_in_layer),
            nodes_in_second_layer = last(neurons_in_layer),
            nodes_in_dense_layer = first(dense_layer),
            accuracy = accuracy) %>%
  ungroup() %>%
  select(-model_id) %>%
  distinct() %>%
  arrange(desc(nodes_in_first_layer))

sweep_results %>%
  plot_ly(x = ~nodes_in_first_layer,
          y = ~nodes_in_second_layer,
          z = ~nodes_in_dense_layer,
          marker = list(color = ~accuracy,
                        colorscale = c('#FFE1A1', '#683531'),
                        showscale = TRUE)) %>%
  add_markers() %>%
  layout()
# Same summary as a table, best accuracy first.
readRDS("convnet_2_layer_models.RDS") %>%
  group_by(model_id) %>%
  transmute(nodes_in_first_layer = first(neurons_in_layer),
            nodes_in_second_layer = last(neurons_in_layer),
            nodes_in_dense_layer = first(dense_layer),
            accuracy = accuracy) %>%
  ungroup() %>%
  select(-model_id) %>%
  distinct() %>%
  arrange(desc(accuracy))
## # A tibble: 58 x 4
## nodes_in_first_layer nodes_in_second_layer nodes_in_dense_lay… accuracy
## <int> <int> <int> <dbl>
## 1 16 32 64 0.925
## 2 16 64 32 0.923
## 3 16 64 64 0.923
## 4 8 64 64 0.919
## 5 8 64 32 0.916
## 6 32 64 64 0.915
## 7 16 16 64 0.915
## 8 64 32 32 0.914
## 9 16 32 32 0.913
## 10 64 64 64 0.913
## # ... with 48 more rows
The best models all have a mid-sized (~16 filter) first conv layer with a larger second layer. There’s not much in it, but the absolute best has 16 and 32 conv filters and 64 units in the dense layer. I’ll go with that as the ‘optimum’.
# The 'optimum' structure from the sweep: 16 then 32 conv filters, 64 dense units.
final_model <- keras_model_sequential() %>%
  layer_separable_conv_2d(filters = 16, kernel_size = c(2, 5),
                          activation = "relu", input_shape = c(2, 232, 1),
                          padding = "same") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_separable_conv_2d(filters = 32, kernel_size = c(2, 5),
                          activation = "relu", padding = "same") %>%
  layer_flatten() %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
summary(final_model)
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## separable_conv2d_1 (SeparableCon (None, 2, 232, 16) 42
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPooling2D) (None, 1, 116, 16) 0
## ___________________________________________________________________________
## separable_conv2d_2 (SeparableCon (None, 1, 116, 32) 704
## ___________________________________________________________________________
## flatten_1 (Flatten) (None, 3712) 0
## ___________________________________________________________________________
## dense_1 (Dense) (None, 64) 237632
## ___________________________________________________________________________
## dense_2 (Dense) (None, 1) 65
## ===========================================================================
## Total params: 238,443
## Trainable params: 238,443
## Non-trainable params: 0
## ___________________________________________________________________________
# Compile and train the chosen structure for 20 epochs.
compile(final_model,
        optimizer = "rmsprop",
        loss = "binary_crossentropy",
        metrics = c("accuracy"))
history <- fit(final_model,
               x_train,
               y_train,
               epochs = 20,
               batch_size = 512,
               validation_data = list(x_val, y_val))
plot(history)
Looks OK. The accuracy is still increasing, so I’ll massively overtrain and see what happens.
# Rebuild the 16/32/64 model from scratch and deliberately overtrain it
# (100 epochs) to see where validation performance turns over.
final_model <- keras_model_sequential() %>%
  layer_separable_conv_2d(filters = 16, kernel_size = c(2, 5),
                          activation = "relu", input_shape = c(2, 232, 1),
                          padding = "same") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_separable_conv_2d(filters = 32, kernel_size = c(2, 5),
                          activation = "relu", padding = "same") %>%
  layer_flatten() %>%
  layer_dense(units = 64, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

final_model %>% compile(optimizer = "rmsprop",
                        loss = "binary_crossentropy",
                        metrics = c("accuracy"))

history <- final_model %>% fit(x_train, y_train,
                               epochs = 100,
                               batch_size = 512,
                               validation_data = list(x_val, y_val))
plot(history)
This goes bad quickly. It looks like under 20 is a decent number of epochs. To reduce the chance of overfitting I’d like to try the smallest similar model; as the results above are all so close, I’ll try something much smaller.
# A much smaller model (8/32 conv filters, 16 dense units), again trained
# for 100 epochs to compare its overfitting behaviour.
final_model <- keras_model_sequential() %>%
  layer_separable_conv_2d(filters = 8, kernel_size = c(2, 5),
                          activation = "relu", input_shape = c(2, 232, 1),
                          padding = "same") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_separable_conv_2d(filters = 32, kernel_size = c(2, 5),
                          activation = "relu", padding = "same") %>%
  layer_flatten() %>%
  layer_dense(units = 16, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

final_model %>% compile(optimizer = "rmsprop",
                        loss = "binary_crossentropy",
                        metrics = c("accuracy"))

history <- final_model %>% fit(x_train, y_train,
                               epochs = 100,
                               batch_size = 512,
                               validation_data = list(x_val, y_val))
plot(history)
This degrades more slowly, but it still overtrains and reaches nearly the same accuracy. The simpler model is better (Occam), so I’ll have one more go with a very simple model.
# The minimal candidate: 4/8 conv filters and 8 dense units, 100 epochs.
final_model <- keras_model_sequential() %>%
  layer_separable_conv_2d(filters = 4, kernel_size = c(2, 5),
                          activation = "relu", input_shape = c(2, 232, 1),
                          padding = "same") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_separable_conv_2d(filters = 8, kernel_size = c(2, 5),
                          activation = "relu", padding = "same") %>%
  layer_flatten() %>%
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

final_model %>% compile(optimizer = "rmsprop",
                        loss = "binary_crossentropy",
                        metrics = c("accuracy"))

history <- final_model %>% fit(x_train, y_train,
                               epochs = 100,
                               batch_size = 512,
                               validation_data = list(x_val, y_val))
plot(history)
OK, this looks better. It takes slightly longer to train — approximately 40 epochs — but training and validation track each other really nicely. Let’s take the 4, 8, 8 structure as the final model.
# Final model: the 4/8/8 structure, trained fresh for 30 epochs and then
# scored once on the held-out test set.
final_model <- keras_model_sequential() %>%
  layer_separable_conv_2d(filters = 4, kernel_size = c(2, 5),
                          activation = "relu", input_shape = c(2, 232, 1),
                          padding = "same") %>%
  layer_max_pooling_2d(pool_size = c(2, 2)) %>%
  layer_separable_conv_2d(filters = 8, kernel_size = c(2, 5),
                          activation = "relu", padding = "same") %>%
  layer_flatten() %>%
  layer_dense(units = 8, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

final_model %>% compile(optimizer = "rmsprop",
                        loss = "binary_crossentropy",
                        metrics = c("accuracy"))

final_model %>% fit(x_train, y_train,
                    epochs = 30,
                    batch_size = 512,
                    validation_data = list(x_val, y_val))

final_model %>% evaluate(x_test, y_test)
## $loss
## [1] 0.2422469
##
## $acc
## [1] 0.8924612
save_model_hdf5(final_model, filepath = "../data/convnet_model.hdf5")